import os
import sys
import pybedtools
from numpy import *

# Datasets can be listed on the command line; each combination has its own
# peak file. Without arguments, use the HiSeq + CAGE pair and "peaks.bed".
datasets = tuple(sys.argv[1:])
if datasets:
    filename = "peaks.%s.bed" % "_".join(datasets)
else:
    datasets = ("HiSeq", "CAGE")
    filename = "peaks.bed"

# Map each peak name to its row number in the expression matrix. Row numbers
# follow the order of the peaks in the BED file.
print("Reading", filename)
peaks = pybedtools.BedTool(filename)
indices = {}
for index, line in enumerate(peaks):
    name = line.name
    if name in indices:
        # A repeated name would overwrite the earlier row number, leaving
        # fewer dictionary entries than peaks, so that the stored row numbers
        # no longer fit a matrix sized from len(indices).
        raise ValueError(
            "duplicate peak name %r in %s (peaks %d and %d)"
            % (name, filename, indices[name], index)
        )
    indices[name] = index


# Time points used to group the libraries of the time-course datasets.
timepoints = (0, 1, 4, 12, 24, 96)

# StartSeq libraries are kept in a single flat list; every other dataset gets
# one (initially empty) list of libraries per time point.
libraries = {
    dataset: [] if dataset == "StartSeq" else [[] for _ in timepoints]
    for dataset in datasets
}

if "CAGE" in datasets:
    # CAGE file names are parsed as <timepoint>_hr_<replicate>.fa.gz; the
    # library name is the part before the first dot, and libraries are
    # grouped by time point.
    directory = "/osc-fs_home/mdehoon/Data/CASPARs/CAGE/Fasta/"
    for filename in os.listdir(directory):
        terms = filename.split(".")
        assert terms[1] == "fa"
        assert terms[2] == "gz"
        library = terms[0]
        timepoint, replicate = library.rsplit("_", 1)
        # Compare against the individual letters: a substring test on
        # "ABCDEFGH" would also accept "" and runs such as "AB".
        assert replicate in tuple("ABCDEFGH")
        timepoint, hr = timepoint.split("_")
        assert hr == 'hr'
        timepoint = int(timepoint)
        # Raises ValueError if the time point is not one of `timepoints`.
        index = timepoints.index(timepoint)
        libraries['CAGE'][index].append(library)

if "MiSeq" in datasets:
    # MiSeq file names are parsed as <sample>_<replicate>_<READ1|READ2>.fq.gz.
    # Each library is registered once, via its READ1 file.
    directory = "/osc-fs_home/mdehoon/Data/CASPARs/MiSeq/Fastq/"
    for filename in os.listdir(directory):
        parts = filename.split(".")
        assert parts[1] == "fq"
        assert parts[2] == "gz"
        library, readno = parts[0].rsplit("_", 1)
        assert readno in ("READ1", "READ2")
        if readno == "READ2":
            continue
        sample, replicate = library.rsplit("_", 1)
        assert replicate in ("r1", "r2", "r3")
        # Samples whose name does not start with "t" are knockdown samples
        # and are left out.
        if sample.startswith("t"):
            timepoint = int(sample[1:])
            # Time point 1, replicate r3 is a negative control using water
            # instead of RNA as starting material in HiSeq; don't include it
            # in MiSeq either.
            if timepoint != 1 or replicate != "r3":
                slot = timepoints.index(timepoint)
                libraries["MiSeq"][slot].append(library)

if "HiSeq" in datasets:
    # HiSeq file names are parsed as t<timepoint>_<replicate>.fq.gz; the
    # library name is the part before the first dot.
    directory = "/osc-fs_home/mdehoon/Data/CASPARs/HiSeq/Fastq/"
    for filename in os.listdir(directory):
        parts = filename.split(".")
        assert parts[1] == "fq"
        assert parts[2] == "gz"
        library = parts[0]
        label, replicate = library.rsplit("_", 1)
        assert replicate in ("r1", "r2", "r3")
        assert label.startswith("t")
        timepoint = int(label[1:])
        # Time point 1, replicate r3 is a negative control using water
        # instead of RNA as starting material; leave it out.
        if timepoint != 1 or replicate != "r3":
            slot = timepoints.index(timepoint)
            libraries["HiSeq"][slot].append(library)

if "StartSeq" in datasets:
    # StartSeq file names are parsed as <library>.fa.gz; the libraries are
    # collected in one flat list (no time points).
    directory = "/osc-fs_home/mdehoon/Data/CASPARs/StartSeq/Fasta/"
    for filename in os.listdir(directory):
        parts = filename.split(".")
        assert parts[1] == "fa"
        assert parts[2] == "gz"
        library = parts[0]
        # SRR7071454 and SRR7071455 are RNA input control libraries and are
        # left out.
        if library not in ("SRR7071454", "SRR7071455"):
            libraries['StartSeq'].append(library)


# The expression matrix has one row per peak name and one column per library
# across all selected datasets.
n = len(indices)
m = 0
for dataset in datasets:
    entries = libraries[dataset]
    if dataset == "StartSeq":
        # Flat list of libraries.
        m += len(entries)
    else:
        # One list of libraries per time point. A list (not a generator) is
        # passed to sum because the star import from numpy shadows the
        # builtin sum in this file.
        m += sum([len(group) for group in entries])

data = zeros((n, m))

# Fill the matrix one column per library. Columns are ordered by dataset (in
# the order given), then by time point where applicable, then by sorted
# library name; `header` records the matching column labels.
header = []

j = 0
for dataset in datasets:
    if dataset == "StartSeq":
        # A single group holding all StartSeq libraries.
        groups = [libraries[dataset]]
    else:
        # One group of libraries per time point, in time point order.
        groups = libraries[dataset]
    for group in groups:
        for library in sorted(group):
            header.append("%s_%s" % (dataset, library))
            filename = "%s.%s.expression.bed" % (dataset, library)
            print("Reading", filename)
            for line in pybedtools.BedTool(filename):
                # The BED score field holds the count for the peak named in
                # the name field; the name selects the matrix row.
                data[indices[line.name], j] = int(line.score)
            j += 1

# Write the matrix as a tab-separated table: a header line ("peak" followed
# by the library labels), then one line per peak with its integer counts.
# The default dataset pair is written to the short file name; any other
# combination gets the dataset names in the file name.
if datasets == ("HiSeq", "CAGE"):
    filename = "peaks.expression.txt"
else:
    term = "_".join(datasets)
    filename = "peaks.%s.expression.txt" % term
print("Writing", filename)
# The context manager closes the file even if writing a row fails.
with open(filename, 'wt') as handle:
    handle.write("\t".join(["peak"] + header) + "\n")
    # Emit rows in matrix order and fetch each row through the index stored
    # for its peak name, rather than relying on the dictionary's iteration
    # order happening to match the row order.
    for name in sorted(indices, key=indices.get):
        counts = ["%d" % count for count in data[indices[name]]]
        handle.write("\t".join([name] + counts) + "\n")
